#Loading packages
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from IPython.display import HTML
#Read the world population CSV into the DataFrame that every later step works from
df = pd.read_csv(
    filepath_or_buffer="C:/Users/Nathan/Documents/Portfolio/Portfolio Data/World Population EDA/world_population.csv"
)
#Transforming Data
#Decade columns to reshape from wide format (one column per year) into long format
years = ['1970 Population','1980 Population','1990 Population','2000 Population','2010 Population','2020 Population']
#"Melt" the selected decade populations into two columns, Year and Population, while
#retaining Country and Continent. melt builds a new DataFrame, so df itself is left
#unchanged and no separate copy is needed; var_name/value_name name the new columns
#directly instead of renaming the default 'variable'/'value' afterwards.
df_by_year = pd.melt(df,
                     id_vars=['Country','Continent'],
                     value_vars=years,
                     var_name='Year',
                     value_name='Population')
#Drop the literal ' Population' suffix so Year holds only the year text (e.g. '1970');
#regex=False makes this a plain substring replacement on every pandas version
df_by_year['Year'] = df_by_year['Year'].str.replace(' Population','',regex=False)
#Sort data by population year, then by population within each year
df_by_year = df_by_year.sort_values(by=['Year','Population'])
#Determining the number of rows and columns in the data (shape is a (rows, columns) tuple)
df.shape
(234, 17)
#Preview the first 5 rows of the dataset
df.head(5)
| Rank | CCA3 | Country | Capital | Continent | 2022 Population | 2020 Population | 2015 Population | 2010 Population | 2000 Population | 1990 Population | 1980 Population | 1970 Population | Area (km²) | Density (per km²) | Growth Rate | World Population Percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 36 | AFG | Afghanistan | Kabul | Asia | 41128771 | 38972230 | 33753499 | 28189672 | 19542982 | 10694796 | 12486631 | 10752971 | 652230 | 63.0587 | 1.0257 | 0.52 |
| 1 | 138 | ALB | Albania | Tirana | Europe | 2842321 | 2866849 | 2882481 | 2913399 | 3182021 | 3295066 | 2941651 | 2324731 | 28748 | 98.8702 | 0.9957 | 0.04 |
| 2 | 34 | DZA | Algeria | Algiers | Africa | 44903225 | 43451666 | 39543154 | 35856344 | 30774621 | 25518074 | 18739378 | 13795915 | 2381741 | 18.8531 | 1.0164 | 0.56 |
| 3 | 213 | ASM | American Samoa | Pago Pago | Oceania | 44273 | 46189 | 51368 | 54849 | 58230 | 47818 | 32886 | 27075 | 199 | 222.4774 | 0.9831 | 0.00 |
| 4 | 203 | AND | Andorra | Andorra la Vella | Europe | 79824 | 77700 | 71746 | 71519 | 66097 | 53569 | 35611 | 19860 | 468 | 170.5641 | 1.0100 | 0.00 |
The dataset contains population information on 234 different countries and territories within 17 variables:
- Rank : Ranking from 1 to 234 for each country/territory based on current population size
- CCA3 : The three-letter country/territory code associated with each country/territory
- Country : The name of the country/territory
- Capital : The name of the capital for the given country/territory
- Continent : The name of the continent for the given country/territory (Asia, Africa, Europe, North America, South America, and Oceania)
- 2022 Population : The 2022 population of the given country/territory
- 2020 Population : The 2020 population of the given country/territory
- 2015 Population : The 2015 population of the given country/territory
- 2010 Population : The 2010 population of the given country/territory
- 2000 Population : The 2000 population of the given country/territory
- 1990 Population : The 1990 population of the given country/territory
- 1980 Population : The 1980 population of the given country/territory
- 1970 Population : The 1970 population of the given country/territory
- Area (km²) : The land mass of the given country/territory in km²
- Density (per km²) : The population density of the given country/territory in people per km²
- Growth Rate : The current population growth rate of the given country/territory
- World Population Percentage : The percentage of the total world population residing in the given country/territory

Start by ensuring that every column is complete and contains no missing values. This can be done by counting the number of non-missing instances in each variable column and comparing it with the known length of the dataset.
A quick and easy way to do this is by generating a bar chart.
#Count the non-missing values in each column; a column without gaps has one value per row
missing = df.notnull().sum(axis=0)
#Generate bar plot for each variable in the dataset counting the number of non-missing instances
fig = px.bar(x=missing.index,
             y=missing.values,
             text=missing.values, #include the number of non-missing values inside each bar on the plot for clarity
             #take the row count from the data itself so the title cannot drift from the dataset
             title='World Population Dataset: Total Number of Data Points (out of {} rows)'.format(len(df))
             )
#Relabel the hover text and the axes so the chart reads without referring back to the code
fig.update_traces(hovertemplate='<br> Variable: %{x} </br> Number of Instances: %{y}')
fig.update_layout(xaxis_title='Dataset Variables',yaxis_title='Number of Instances')
fig.show(renderer='notebook')
Since every column contains 234 instances, we can assume there are no missing values in the data.
To better understand the structure of the data, we may wish to know how many countries and territories are being accounted for in each continent. This can also be done using a bar chart by grouping and counting the number of Country instances within each Continent level.
#Tally how many countries/territories are listed under each continent
country_counts = df.groupby('Continent')['Country'].count()
#Draw the tallies as a horizontal bar chart with one bar per continent
fig = px.bar(
    x=country_counts.values,
    y=country_counts.index,
    text=country_counts.values,   #print the tally on each bar
    color=country_counts.index,   #give every continent its own color
    color_discrete_sequence=px.colors.sequential.Peach[::-1],
    title='World Population Data: Number of Countries & Territories per Continent',
)
#Label the axes, then reword the hover text for clarity
fig.update_layout(
    xaxis_title='Number of Countries & Territories',
    yaxis_title='Continent',
)
fig.update_traces(
    hovertemplate='<br> Continent: %{y} </br> Number of Countries Included: %{x}'
)
fig.show()
We can begin by looking at the total world population across the previous five decades to start understanding how it has changed over time. Since the data already contains information on the continent and the country or territory, we can use a filled area plot to see the total world population along with each continent's and country's contribution.
#Filled area plot of population per year: one line per country, colored by continent
px.area(
    data_frame=df_by_year,
    x='Year',
    y='Population',
    line_group='Country',
    color='Continent',
    color_discrete_sequence=px.colors.sequential.Agsunset,
)
The data contains two variables that are clearly linked to one another as well as to population: Area (km²) and Density (per km²).
#World map shaded by each country's 'Area (km²)' value; countries are matched by name
fig = px.choropleth(
    data_frame=df,
    locationmode='country names',
    locations='Country',
    color='Area (km²)',
    color_continuous_scale=px.colors.sequential.Bluyl,
    projection='natural earth',
    template='ggplot2',
    title='Total Land Mass by Country',
)
#Show the latitude/longitude grid and tint oceans and lakes light blue
fig.update_geos(
    showocean=True, oceancolor="LightBlue",
    showlakes=True, lakecolor="LightBlue",
    lataxis_showgrid=True,
    lonaxis_showgrid=True,
)
fig.show()
#World map shaded by each country's 'Density (per km²)' value; countries are matched by name
fig = px.choropleth(
    data_frame=df,
    locationmode='country names',
    locations='Country',
    color='Density (per km²)',
    color_continuous_scale=px.colors.sequential.OrRd,
    range_color=(0,150),   #limit the color scale to the 0-150 range
    projection='natural earth',
    template='ggplot2',
    title='Total Population Density by Country',
)
#Show the latitude/longitude grid and tint oceans and lakes light blue
fig.update_geos(
    showocean=True, oceancolor="LightBlue",
    showlakes=True, lakecolor="LightBlue",
    lataxis_showgrid=True,
    lonaxis_showgrid=True,
)
#Keep a second reference to this figure under the name 'example'
example = fig
fig.show()
Since the data contains information on the world population split by each country's and territory's contribution, it may be easier to get a general understanding of each country's and territory's impact by viewing the information on a choropleth map. This makes it possible to view the aggregated population data in a geographic way.
Let's start by looking at a map of the current 2022 Population:
#Choropleth map visualizing the current (2022) world population by country
fig = px.choropleth(
    data_frame=df,
    locationmode='country names',
    locations='Country',
    color='2022 Population',
    color_continuous_scale=px.colors.sequential.RdBu[::-1],
    range_color=(0,1000000000),   #limit the color scale at one billion
    projection='natural earth',
    template='ggplot2',
    title='World Map Indicating 2022 Populations',
)
#Add grid lines and water coloring for clarity on borders and position
fig.update_geos(
    showocean=True, oceancolor="LightBlue",
    showlakes=True, lakecolor="LightBlue",
    lataxis_showgrid=True,
    lonaxis_showgrid=True,
)
Next, viewing the same map as an animation over the past 50 years can help see if there are any obvious trends in population growth in specific regions across the globe while simultaneously being able to compare them with the rest of the world.
#Animated choropleth: country/territory populations, one frame per decade from 1970 to 2020
fig = px.choropleth(
    data_frame=df_by_year,
    locationmode='country names',
    locations='Country',          #column used to identify countries
    color='Population',           #column that drives the color intensity
    animation_frame='Year',       #column that defines each animation frame
    color_continuous_scale=px.colors.sequential.RdBu[::-1],
    range_color=(0,1000000000),   #limit the colorscale; India and China are so large the rest seem unchanged
    projection='natural earth',   #draw the map with the 'natural earth' projection
    template='ggplot2',
    title='World Map Indicating Populations: 1970 - 2020',
)
#Include longitude, latitude, and water markings for clarity and visual appeal
fig.update_geos(
    showocean=True, oceancolor="LightBlue",
    showlakes=True, lakecolor="LightBlue",
    lataxis_showgrid=True,
    lonaxis_showgrid=True,
)
Since roughly half of the current world population resides in China, India, and Pakistan alone, we isolate Asia to take a closer look at its changes over the past 50 years.
#Generate the same animated choropleth map as before, this time changing the scope of the
#map from the entire world to only Asia
fig = px.choropleth(data_frame=df_by_year,
                    locations='Country',
                    locationmode='country names',
                    color='Population',       #absolute population, not density
                    animation_frame='Year',   #one frame per decade
                    color_continuous_scale=px.colors.sequential.RdBu[::-1],
                    range_color=(0,1000000000),
                    #the map colors by the Population column, so the title must not call it
                    #a density map
                    title='Asia Population Map: 1970 - 2020',
                    template='ggplot2',
                    scope='asia')
#Include longitude, latitude, and water markings for clarity
fig.update_geos(lataxis_showgrid=True,
                lonaxis_showgrid=True,
                showocean=True, oceancolor="LightBlue",
                showlakes=True, lakecolor="LightBlue")
fig.show()
Taking a closer look at each continent's contribution to world population levels, we can start by looking at the 2022 Population and see the average population sizes of their corresponding territories and countries.
#Average 2022 population of the countries/territories in each continent,
#rounded and ordered from largest to smallest
continent_populations = (
    df.groupby('Continent')['2022 Population']
    .mean()
    .round()
    .sort_values(ascending=False)
)
#Bar plot visualizing each continent's average population
fig = px.bar(
    x=continent_populations.index,
    y=continent_populations.values,
    text=continent_populations.values,    #print the average on each bar for clarity
    color=continent_populations.index,    #one color per continent
    color_discrete_sequence=px.colors.sequential.Darkmint[::-1],
    title='Average 2022 Population by Continent',
)
#Reword the hover information for inspection, then label the axes
fig.update_traces(hovertemplate='<br> Continent: %{x} </br> Average Population: %{y}')
fig.update_layout(
    xaxis_title='Continents',
    yaxis_title='Average Population Count',
)
fig.show()
Aside from the current world population numbers, inspecting average population sizes from the past few decades could help identify any apparent trends. From the data, we can pull the 2020 Population, 2010 Population, 2000 Population, 1990 Population, 1980 Population, and 1970 Population numbers and generate similar plots for reference.
#Decade population columns to average for each continent
years = ['2020 Population','2010 Population', '2000 Population', '1990 Population','1980 Population', '1970 Population']
#Calculate the average country/territory population for each continent.
#Only the decade columns are selected before averaging: taking the mean of the whole
#grouped frame also includes the text columns (e.g. CCA3, Country, Capital), which
#raises a TypeError on pandas 2.x. Selecting the columns up front also means no
#unneeded variables have to be dropped afterwards.
cont_pop_years = df.groupby('Continent')[years].mean().round()
#Turn the Continent index back into a regular column
cont_pop_years = cont_pop_years.reset_index()
#Combine all the population year variables into two variables: one containing the year
#and one containing the corresponding population
cont_pop_years = pd.melt(cont_pop_years,
                         id_vars=['Continent'],
                         value_vars=years,
                         var_name='Year',
                         value_name='Population')
#Remove the literal ' Population' suffix from the Year variable and just leave the year
cont_pop_years['Year'] = cont_pop_years['Year'].str.replace(' Population','',regex=False)
#Fix the continent order through an ordered list of categories so the bars keep the
#same arrangement in every animation frame
cat = ['Asia','South America','Africa','North America','Europe','Oceania']
cont_pop_years['Continent'] = pd.Categorical(cont_pop_years['Continent'],categories=cat)
cont_pop_years = cont_pop_years.sort_values(by=['Continent','Year'])
#Generate animated bar plot of average country population by continent, one frame per decade
fig = px.bar(cont_pop_years,
             x='Continent',
             y='Population',
             color='Continent',
             color_discrete_sequence=px.colors.sequential.Darkmint[::-1],
             title='Average Population by Continent: 1970 - 2020',
             animation_frame='Year',
             range_y=[0,95000000]   #fixed y-axis so bar heights are comparable across frames
             )
#Rename axes and hover information for clarity
fig.update_layout(xaxis_title='Continents', yaxis_title='Population Count')
fig.update_traces(hovertemplate='<br> Continent: %{x} </br> Average Population: %{y}')
fig.show()
Aside from the average values, we can compare the total population sizes of each continent to one another as well as see their percentage level contributions to the total world population using a pie chart. For a baseline, we can first inspect the levels from the 2022 Population sizes.
#Donut-style pie chart of the 2022 population, with one slice per continent
fig = px.pie(
    data_frame=df,
    names='Continent',
    values='2022 Population',
    hole=0.25,
    color_discrete_sequence=px.colors.sequential.Magenta[::-1],
    title='Total 2022 Population by Continent',
)
#Show the label, percentage, and value on every slice for better readability
fig.update_traces(textinfo='label+percent+value')
fig.show()
Also using the 2020 Population,2010 Population,2000 Population,1990 Population,1980 Population,and 1970 Population data, we can generate similar visuals for their total population sizes and see how certain continents and their overall contributions have shifted over time.
#Decade columns laid out as they appear in the subplot grid: one inner list per row,
#left column first
populations = [['1970 Population','1980 Population'],
               ['1990 Population','2000 Population'],
               ['2010 Population','2020 Population']]
#Build a 3x2 grid of pie ('domain') subplots, one per decade
fig = make_subplots(rows=3,cols=2,
                    specs=[[{'type':'domain'},{'type':'domain'}],
                           [{'type':'domain'},{'type':'domain'}],
                           [{'type':'domain'},{'type':'domain'}]],
                    subplot_titles=['Total 1970 Population','Total 1980 Population','Total 1990 Population',
                                    'Total 2000 Population','Total 2010 Population','Total 2020 Population'],
                    horizontal_spacing=0.3,
                    column_widths=[0.5,0.5],vertical_spacing=0)
#Walk the grid row by row; enumerate supplies the 1-based row/column positions that
#add_trace expects, replacing the hand-maintained counter
for i, row_columns in enumerate(populations, start=1):
    for j, pop in enumerate(row_columns, start=1):
        #Total population per continent for this decade, largest first (computed once and
        #reused for both the values and the labels)
        continent_totals = df.groupby('Continent')[pop].sum().sort_values(ascending=False)
        fig.add_trace(
            go.Pie(values=continent_totals,
                   labels=continent_totals.index,
                   name='',
                   marker=dict(colors=px.colors.sequential.Magenta[::-1]),
                   texttemplate='<br>%{label}</br>%{percent}</br>%{value}',
                   textposition='outside',
                   pull=[0,0.1,0.1,0,0,0]   #pull out the 2nd and 3rd largest slices
                   ),
            row=i,
            col=j
        )
fig.update_layout(height=1500,showlegend=False,title='Total World Population: 1970 - 2020')
fig.show()
After looking at each continent as a whole, we can identify which countries and territories have contributed the most and least to the current world population.
#Rank countries/territories by 2022 population and keep the five largest
mostpop_country = (
    df.groupby('Country')['2022 Population']
    .sum()
    .sort_values(ascending=False)
    .head(5)
)
#Bar plot of those five most populated countries of 2022
fig = px.bar(
    x=mostpop_country.index,
    y=mostpop_country.values,
    text=mostpop_country.values,
    color_discrete_sequence=['MidnightBlue'],
    title='Top 5 Most Populated Countries According to 2022 Population',
)
#Reword the hover information for additional clarity
fig.update_traces(hovertemplate='<br> Country: %{x} </br> 2022 Population: %{y}')
#Label the axes for clarity
fig.update_layout(xaxis_title='Country', yaxis_title='Population')
fig.show()
#Calculate the top 5 least populated countries/territories of 2022
leastpop_country = df.groupby('Country')['2022 Population'].sum().sort_values(ascending=True).head(5)
#Re-sort those five from largest to smallest so the bars descend from left to right
leastpop_country = leastpop_country.sort_values(ascending=False)
#Generate a bar plot of the top 5 least populated countries/territories of 2022
fig = px.bar(x=leastpop_country.index,
             y=leastpop_country.values,
             color_discrete_sequence=['IndianRed'],
             text=leastpop_country.values,
             #title spelling corrected ("Countires" -> "Countries")
             title='Top 5 Least Populated Countries According to 2022 Population'
             )
#Update axis titles for clarity
fig.update_layout(xaxis_title='Country', yaxis_title='Population')
#Update hover information for additional clarity
fig.update_traces(hovertemplate='<br> Country: %{x} </br> 2022 Population: %{y}')
fig.show()
We can try to determine if there are any changes in these rankings by looking at the most and least populated countries and territories of the past 50 years.
#Decades to cycle through for plotting, listed in chronological order
populations = ['1970 Population','1980 Population','1990 Population',
               '2000 Population','2010 Population','2020 Population']
#Using a for loop, plot the top 5 most and least populated countries/territories for each
#decade over the past 50 years (one figure per decade)
for pop in populations:
    #Population per country/territory for this decade, computed once and reused below
    country_totals = df.groupby('Country')[pop].sum()
    #Five largest, and five smallest re-sorted from largest to smallest for display
    mostpop_country = country_totals.sort_values(ascending=False).head(5)
    leastpop_country = country_totals.sort_values(ascending=True).head(5)
    leastpop_country = leastpop_country.sort_values(ascending=False)
    #Generate a 1x2 subplot matrix for the decade: most populated on the left,
    #least populated on the right
    fig = make_subplots(rows=1,cols=2,specs=[[{'type':'xy'},{'type':'xy'}]],
                        subplot_titles=['Top 5 Most Populated Countries','Top 5 Least Populated Countries'],
                        y_title='Population', x_title='Countries')
    #Both panels share the same bar settings and differ only in data, color, and column
    panels = [(mostpop_country, 'MidnightBlue', 1),
              (leastpop_country, 'IndianRed', 2)]
    for ranking, bar_color, column in panels:
        fig.add_trace(
            go.Bar(x=ranking.index,
                   y=ranking.values,
                   texttemplate='%{y}',   #print each bar's value on the bar
                   name='',
                   showlegend=False,
                   marker=dict(color=bar_color)
                   ),
            row=1,
            col=column
        )
    #Include a main title indicating which decade the subplots are referring to
    fig.update_layout(title_text=pop)
    fig.show()